* ---------------------------------------------------------------------------
* Setup: close any open log, reset Stata's state, make sure the -unique-
* command is available, and load the raw establishment data.
* ---------------------------------------------------------------------------
cap log close

clear all

* Install -unique- from SSC only when it is not already on the adopath, so the
* script does not abort on a machine without network access.
cap which unique
if _rc ssc install unique, replace

use "../../data/raw data/shoko_ULURU/shoko_raw.dta"

* Convert capital, month and year to numeric. -force- turns any non-numeric
* entry into a missing value instead of stopping.
destring 資本金 month year, replace force
* Numeric, value-labelled industry code built from the industry string variable.
encode 業種大分類, gen(ind)


* Keep manufacturing, retail and service establishments only
* (this drops individuals and unions).
tab ind
keep if inlist(ind, 3, 4, 5) /* manufacturing, retail, service */

* Lookup from sector prefix to its code in ind.
local code_manu 3
local code_reta 4
local code_serv 5

* One 0/1 indicator per sector.
foreach s in manu reta serv {
	gen `s' = (ind == `code_`s'')
}

gen emp = manu + reta + serv

* Sector-by-year indicators for each year from 1945 to 1952.
forvalues t = 1945/1952 {
	foreach s in manu reta serv {
		gen `s'_`t' = (ind == `code_`s'' & year == `t')
	}
}

* Any-sector indicator for each year from 1945 to 1952.
forvalues t = 1945/1952 {
	gen emp_`t' = manu_`t' + reta_`t' + serv_`t'
}

tab year

* Some observations have year > 1952, which is likely a typo.
* hist year if year<=1952
tab ind

*** A large part of the sample was established after 1945 ***

* Pool all establishments dated before 1945 under the single year 1926
* (beginning of Showa). The unmodified value is kept in year_original.
gen year_original = year
replace year = 1926 if year < 1945

* Sector indicators for the pooled pre-1945 group.
local code_manu 3
local code_reta 4
local code_serv 5
foreach s in manu reta serv {
	gen `s'_1926 = (ind == `code_`s'' & year == 1926)
}
gen emp_1926 = manu_1926 + reta_1926 + serv_1926


* Inspect the matched addresses.
tabulate matched_address
* About 13% have missing addresses. Not bad, given that aki-gun etc. are also included in the data.
* Out of 197 blocks, 192 blocks are contained in the data.
* Not bad, but further cleaning of the addresses should be considered.
unique matched_address

* matched_address becomes the merge key; capital becomes the firm-size variable.
rename (matched_address 資本金) (旧町名 fsize)

* Distribution of firm size in levels and in logs.
summarize fsize, detail
gen l_fsize = log(fsize)
histogram l_fsize

* Attach block information to each establishment; keep matched rows only.
merge m:1 旧町名 using "../../data/raw data/blockinfo.dta"
keep if _merge==3

drop if 旧町名=="似島町"  /* drop island */

* Convert the area size from m^2 to km^2
replace area = area/1000000


* To show in a map, need to collapse the data to the block level (_ID should uniquely identify obs.)
* The sector indicators and c are summed to get establishment counts per block.
* area is averaged, not summed: it is repeated on every establishment row of
* a block, so summing it would multiply the block area by the number of
* establishments and deflate every density computed from it.
* NOTE(review): assumes area comes from blockinfo.dta and is constant within _ID - confirm.
gen c = 1
collapse (sum) manu* reta* serv* (mean) area (sum) c, by(_ID)
* Total establishment count per block.
gen emp = manu + reta + serv

* Establishments per unit of area, by sector.
foreach s in manu reta serv {
	gen `s'_dens = `s'/area
}
gen emp_dens = manu_dens + reta_dens + serv_dens
gen nonmanu_dens = reta_dens + serv_dens

* Total density by year: the pooled pre-1945 group (1926) and 1945-1952.
foreach t of numlist 1926 1945/1952 {
	gen empdens_`t' = (manu_`t' + reta_`t' + serv_`t')/area
}

** log density (missing wherever the density is zero)
foreach v in emp manu serv reta nonmanu {
	gen l_`v'_dens = ln(`v'_dens)
}

** Figure A8 in paper: scatter of log manufacturing density against log
** retail-and-services density, with a linear fit, exported as PDF.
twoway ///
	(scatter l_manu_dens l_nonmanu_dens, sort color(black) msize(small)) ///
	(lfit l_manu_dens l_nonmanu_dens, sort color(cranberry) lwidth(medthick)), ///
	scheme(s1color) plotregion(lwidth(none) ilwidth(none)) ///
	legend(off) ///
	xtitle("Log establishment density (Retail and Services)") xlabel(, angle(horizontal)) ///
	ytitle("Log establishment density (Manufacturing)") ylabel(, angle(horizontal))
graph export "../../output/figure/figure_twosector_estcount.pdf", as(pdf) replace

